import xml.sax, sys
from xml.sax.handler import *

class POSExtractor(ContentHandler):
    """SAX content handler that gathers tagged words sentence by sentence.

    For every ``w`` element the handler records a
    ``(text, cat, lemma)`` tuple, where ``cat`` and ``lemma`` are the
    element's attributes of those names (``None`` when absent) and ``text``
    is the element's stripped character content.  When ``w`` elements are
    nested, only the outermost one produces a tuple: it keeps the outer
    attributes and its text includes the text of the inner elements.

    Each time a ``SENT`` element closes, the words collected since the
    previous ``SENT`` closed are stored as ``(words, nb)``, where ``nb`` is
    the ``nb`` attribute of the most recently opened ``SENT`` element.
    """

    def __init__(self):
        # Let the base class set up its own state before adding ours.
        ContentHandler.__init__(self)
        self._sentences = []            # finished (words, nb) pairs
        self._words = []                # words of the sentence in progress
        self._chunks = []               # text pieces of the word in progress
        self._current_sent_nb = None
        self._current_word_POS = None
        self._current_word_lemm = None
        self._in_w = 0                  # nesting depth of open "w" elements

    def startElement(self, name, attrs):
        """Record sentence numbers and the attributes of outermost words."""
        if name == "SENT":
            self._current_sent_nb = attrs.get("nb")
        if name == "w":
            self._in_w += 1
            # Only the outermost "w" defines the word's POS and lemma.
            if self._in_w == 1:
                self._current_word_POS = attrs.get("cat")
                self._current_word_lemm = attrs.get("lemma")

    def endElement(self, name):
        """Close the current word or sentence and store what was collected."""
        if name == "SENT":
            self._sentences.append((self._words, self._current_sent_nb))
            self._words = []
        if name == "w":
            self._in_w -= 1
            if self._in_w == 0:
                word = "".join(self._chunks).strip()
                # Words that are empty after stripping are dropped.
                if word:
                    self._words.append(
                        (word, self._current_word_POS, self._current_word_lemm)
                    )
                self._chunks = []

    def characters(self, ch):
        """Buffer character data, but only while inside a ``w`` element.

        Text found between ``w`` elements is ignored so that it cannot be
        prepended to the following word.
        """
        if self._in_w > 0:
            self._chunks.append(ch)

    def get_data(self):
        """Return the list of ``(words, nb)`` pairs collected so far."""
        return self._sentences


def XML_to_POS(xmlstring):
    """Parse an XML string and return what ``POSExtractor`` collected.

    The text is encoded to latin-1 bytes before being handed to the SAX
    parser, so a character that latin-1 cannot represent raises
    ``UnicodeEncodeError``.
    NOTE(review): presumably the documents declare a latin-1 encoding --
    confirm against the callers.

    Returns the value of ``POSExtractor.get_data()`` after parsing.
    """
    extractor = POSExtractor()
    payload = xmlstring.encode("latin-1")
    xml.sax.parseString(payload, extractor)
    return extractor.get_data()

